* Review of Economics and Statistics MS 23551

/*

This Stata(14) program creates the analysis datasets used in Analysis_final.do and Analysis_appendix.do.

*/

***************************************************************
*   Step 1: construct data for the main analysis
*   reg_mon_main.dta 
*   reg_semiannual_main.dta 
*   reg_crsection_main.dta 
*	reg_yr_main_step1.dta 
*	reg_yr_all.dta 
***************************************************************	

use "$workdir/data_monthly", clear // Dataset with linked income, consumption, saving, debt, cash withdraw, wealth, demographics, and anonymized worker and firm ids.

** restrict the sample to employees who continuously receive paychecks via the Bank during the 24-month sample period, 
   ** have a credit card account at the Bank and actively use it every month
keep if !missing(company_no)
bysort id: gen has_income_month = income!=0 & income<.
bysort id: egen has_income_months = total(has_income_month)
bysort id: gen has_cons_month = con!=0 & con<.
bysort id: egen has_cons_months = total(has_cons_month)
keep if has_income_months == 24 & has_cons_months == 24

** restrict the sample to employees who stay at the same firm throughout the sample period
bysort id company_no: gen firm_worked_for = 1 if _n==1
bysort id: egen num_firms_worked_for = total(firm_worked_for)
gen stayer = num_firms_worked_for == 1
keep if stayer == 1 

** restrict the sample to employees from 18 to 65 years old, earning at least the minimum income for full time workers
keep if inrange(age,18,65)
recode yyyymm (201307/201406 = 1) (201407/201506 = 2), generate(year)
gen     minimum_income = 19047 if year==1
replace minimum_income = 19273 if year==2
bysort id: egen full_time_months = total(income>minimum_income & income<.)     
keep if full_time_months == 24

** restrict the sample to employees in firms with 20 workers or more
keep if occupt!="selfemploy"
keep if firmsize>=20

** generate the monthly flows in/out of savings and debt
sort id yyyymm
by id: gen saving_accum   = saving_bal[_n]   - saving_bal[_n-1]   // deposits   
by id: gen carddebt_accum = carddebt_bal[_n] - carddebt_bal[_n-1] // credit card debt
by id: gen loan_accum     = loan_bal[_n]     - loan_bal[_n-1]     // installment loan
by id: gen debt_accum     = debt_bal[_n]     - debt_bal[_n-1]     // all debt
by id: gen savdebt_accum  = (saving_bal[_n]  - saving_bal[_n-1]) - (debt_bal[_n]     - debt_bal[_n-1])
by id: gen savcard_accum  = (saving_bal[_n]  - saving_bal[_n-1]) - (carddebt_bal[_n] - carddebt_bal[_n-1])

save "$workdir/reg_mon_step1", replace

* annual consumption regression data
use "$workdir/reg_mon_step1", clear
	sort id yyyymm
	collapse (sum)  income con saving_accum carddebt_accum loan_accum debt_accum savdebt_accum savcard_accum cash1 cash2 ///
	         (last) saving_bal carddebt_bal loan_bal debt_bal age age2 gender marriage education education_1-education_5 ocpclass ocpclass_1-ocpclass_3 firmsize ///
	         (mean) accountwealth , by(id year main_credit_card  industry region company_no)   
	bysort id: egen wealth = mean(accountwealth) // average account wealth over sample period
	bysort id: egen inctmp = mean(income)        // average income over sample period
	gen wealthtoincome = wealth / inctmp // the wealth to income ratio (average wealth over average income for the sample period)
	foreach var of varlist *_accum {
		gen `var'_2inc = `var' / income // annual flow in/out of savings and debt, normalized by income
	}
	drop inctmp
	gen logincome=log(income)
	gen logcon=log(con)
save "$workdir/reg_yr_step1", replace

* main credit card sample
use "$workdir/reg_yr_step1",clear

	keep if main_credit_card == 1 // restrict to workers who use the Bank as the main credit card bank

	** between component of income
	bysort company_no year: egen firmAvgIncome = mean(income) 
	gen log_income_between = log(firmAvgIncome)

	** within component of income
	gen log_income_within = logincome-log_income_between    

	** residual measures of consumption, income, and income components
	reg logcon   age age2 gender marriage education_2-education_5   
	predict logcon_r, resid
	quietly sum logcon if e(sample)
	replace logcon_r=logcon_r + r(mean)

	reg logincome age age2 gender marriage education_2-education_5    
	predict logincome_r,residuals
	quietly sum logincome if e(sample)
	replace logincome_r=logincome_r + r(mean)

	gen income_r=exp(logincome_r)
	bysort company_no year: egen firmAvgIncome_r=mean(income_r)
	gen log_income_between_r=log(firmAvgIncome_r)
	gen log_income_within_r=logincome_r-log_income_between_r

save "$workdir/reg_yr_main_step1", replace

* all direct depositor sample
use "$workdir/reg_yr_step1",clear

	** between component of income
	bysort company_no year: egen firmAvgIncome=mean(income) 
	gen log_income_between = log(firmAvgIncome)

	** within component of income
	gen log_income_within=logincome-log_income_between

	** residual measures of consumption, income, and income components
	reg logcon   age age2 gender marriage education_2-education_5   
	predict logcon_r, resid
	quietly sum logcon if e(sample)
	replace logcon_r=logcon_r + r(mean)

	reg logincome age age2 gender marriage education_2-education_5    
	predict logincome_r,residuals
	quietly sum logincome if e(sample)
	replace logincome_r=logincome_r + r(mean)

	gen income_r=exp(logincome_r)
	bysort company_no year: egen firmAvgIncome_r=mean(income_r)
	gen log_income_between_r=log(firmAvgIncome_r)
	gen log_income_within_r=logincome_r-log_income_between_r

save "$workdir/reg_yr_all", replace


* monthly consumption regression data
use "$workdir/reg_mon_step1", clear

	keep if main_credit_card == 1 // restrict to workers who use the Bank as the main credit card bank
	gen logincome=log(income)
	gen logcon=log(con)

	** between component of income
	bysort company_no yyyymm: egen firmAvgIncome=mean(income)
	gen log_income_between=log(firmAvgIncome)
	
	** within component of income
	gen log_income_within=logincome-log_income_between

	gen ym = ym(floor(yyyymm/100),mod(yyyymm,100))
	xtset id ym

save "$workdir/reg_mon_main", replace

*semi annual consumption regression data
use "$workdir/reg_mon_step1", clear

	recode yyyymm (201307/201312 = 1) (201401/201406 = 2) (201407/201412 = 3) (201501/201506 = 4), generate(halfyear)
	collapse (sum) income con , by(id halfyear main_credit_card company_no)  

	keep if main_credit_card == 1 // restrict to workers who use the Bank as the main credit card bank
	gen logincome=log(income)
	gen logcon=log(con)

	** between component of income
	bysort company_no halfyear: egen firmAvgIncome=mean(income)
	gen log_income_between=log(firmAvgIncome)

	** within component of income
	gen log_income_within=logincome-log_income_between
	
	xtset id halfyear
save "$workdir/reg_semiannual_main",replace

*cross sectional consumption regression data
use "$workdir/reg_mon_step1", clear

	collapse (sum) income con , by(id main_credit_card company_no)
	replace income = income/2
	replace con = con/2

	keep if main_credit_card == 1 // restrict to workers who use the Bank as the main credit card bank
	gen logincome=log(income)
	gen logcon=log(con)

	** between component of income
	bysort company_no: egen firmAvgIncome=mean(income)
	gen log_income_between=log(firmAvgIncome)

	** within component of income
	gen log_income_within=logincome-log_income_between

save "$workdir/reg_crsection_main", replace


***************************************************************
*   Step 2: construct the channel variables
*   nmcluster.dta (-> reg_yr_main.dta):  data with optimal k-means income clusters for the workers in the firms
***************************************************************	

use "$workdir/reg_mon_step1",clear  
collapse (mean) meanincome = income, by(company_no id)   
gen logmeanincome=log(meanincome)
save "$workdir/tmpcls", replace

program define optimalcluster
	* input :  meanincome, id, company_no for all workers in the same company_no
	* output:  id, company_no, cls, maxcls for all workers in the same company_no
	egen z_logmeanmincome=std(logmeanincome)      
	forvalues k = 1(1)10 {
		cluster kmeans z_logmeanmincome, k(`k') start(random(123)) name(cs`k')
	}
	matrix WSS = J(10,4,.)
	matrix colnames WSS = k lWSS fsdiff sndiff
	forvalues k = 1(1)10 {
		quietly anova z_logmeanmincome cs`k'
		scalar ws`k' =  e(rss)
		matrix WSS[`k', 1] = `k'
		matrix WSS[`k', 2] = log(ws`k')
		matrix WSS[`k', 3] = (WSS[`k'-1,2] - WSS[`k',2])
		matrix WSS[`k', 4] = (WSS[`k'-1,3] - WSS[`k',3])
	}
	matrix list WSS
	*max 
	scalar a=2
	scalar b=el(WSS,3,4)
	forvalues k = 4(1)10 {
		if  el(WSS,`k',4) <0 {
			continue, break // if the code breaks here, then a = k-2
		}
		else if el(WSS,`k',4)> b {
			scalar a=`k'-1
			scalar b=el(WSS,`k',4)
		}
	}
	scalar drop b
	local v=a
	keep company_no id  cs`v'
	rename cs`v' cls
	egen maxcls=max(cls)
	scalar drop a
end

clear
gen company_no = .
save "$workdir/nmcluster", replace

use "$workdir/tmpcls", clear
levelsof company_no, local(company_list)
foreach company of local company_list {
	use "$workdir/tmpcls" if company_no==`company', clear
	optimalcluster
	save "$workdir/temp_nmcluster", replace

	use "$workdir/nmcluster", clear
	append using "$workdir/temp_nmcluster"
	save "$workdir/nmcluster", replace
}

use "$workdir/reg_yr_main_step1", clear
** merge in the optimal k-means income clusters for the workers in the firms
merge m:1 id using "$workdir/nmcluster"
drop if _merge==2
drop _merge  
bysort company_no cls year: egen ClusterAvgIncome=mean(income)
gen log_income_cluster=log(ClusterAvgIncome)
gen log_income_between_cluster = log_income_cluster - log_income_between
gen log_income_within_cluster  = logincome - log_income_cluster
drop ClusterAvgIncome
bysort company_no cls year: egen ClusterAvgIncome_r=mean(income_r)
gen log_income_cluster_r=log(ClusterAvgIncome_r)
gen log_income_between_cluster_r = log_income_cluster_r - log_income_between_r
gen log_income_within_cluster_r  = logincome_r - log_income_cluster_r
drop ClusterAvgIncome_r 
save "$workdir/reg_yr_main", replace


***************************************************************
*   reg_mon_main_peereffect.dta: data with monthly consumption and asset flow variables for the peer effect regressions
***************************************************************	

use "$workdir/reg_mon_main", clear

merge m:1 id using "$workdir/nmcluster"  
drop if _merge==2
drop _merge  

sort company_no cls yyyymm id 
by company_no cls yyyymm: gen Nworkers = _N
by company_no cls yyyymm: egen Totalconsumption = total(con)
by company_no cls yyyymm: egen Totalincome = total(income)
gen peer_con = (Totalconsumption-con)/(Nworkers-1)
gen peer_inc = (Totalincome - income)/(Nworkers-1)

xtset id ym
gen logself_con_lag=L.logcon
gen logpeer_con_lag=log(L.peer_con)
gen logpeer_inc=log(peer_inc)

* IHS (inverse hyperbolic sine) of saving and debt flows
* IHS(x) = log( x + (x^2+1)^(1/2) ) interprets similarly as the logarithm but can handle negative values
gen hsaving_accum=log(saving_accum+((saving_accum)^2+1)^(1/2))   
gen hdebt_accum=log(debt_accum+((debt_accum)^2+1)^(1/2))

gen hself_con_lag=log(L.con+((L.con)^2+1)^(1/2))
gen hpeer_con_lag=log(L.peer_con+((L.peer_con)^2+1)^(1/2))
gen hpeer_inc=log(peer_inc+((peer_inc)^2+1)^(1/2))
gen hself_inc=log(income+((income)^2+1)^(1/2))

save "$workdir/reg_mon_main_peereffect", replace

